{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import torch\n",
    "import time\n",
    "import random\n",
    "import os\n",
    "\n",
    "from keras.preprocessing import sequence\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "\n",
    "from keras.preprocessing import sequence\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "\n",
    "from keras.layers import Input, Dense, Conv1D, Dropout, MaxPooling1D, Flatten, Embedding, concatenate, LSTM, Activation\n",
    "from keras.models import Model\n",
    "from keras.models import Sequential\n",
    "\n",
    "from keras.utils import np_utils\n",
    "\n",
    "import torch\n",
    "from torch.utils import data\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv('data/train_one_label.csv')\n",
    "test = pd.read_csv(\"data/test.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "def word_vectors(train, test):\n",
    "    '''\n",
    "    使用keras对文本数据做预处理\n",
    "    '''\n",
    "    tokenizer = Tokenizer(num_words=1000)\n",
    "    tokenizer.fit_on_texts(train['comment_text'])\n",
    "\n",
    "    train_seq = tokenizer.texts_to_sequences(train['comment_text'])\n",
    "    test_seq = tokenizer.texts_to_sequences(test['comment_text'])\n",
    "\n",
    "    x_train = sequence.pad_sequences(train_seq, maxlen=200)  # shape  (* , 200)\n",
    "    y_train = train['toxic']\n",
    "    x_test = sequence.pad_sequences(test_seq, maxlen=200)  # shape (* , 200)\n",
    "    return torch.tensor(x_train).long(), torch.tensor(y_train.values).long(), torch.tensor(x_test).long()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train, y_train, x_test = word_vectors(train, test)"
   ]
  },
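  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick look at the resulting tensor shapes confirms that every comment was padded to 200 tokens (a minimal sanity check):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expect (n_train, 200), (n_train,), (n_test, 200)\n",
    "print(x_train.shape, y_train.shape, x_test.shape)"
   ]
  },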
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = data.TensorDataset(x_train, y_train)\n",
    "test = data.TensorDataset(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_iter = data.DataLoader(train, batch_size=8, shuffle=True)"
   ]
  },
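  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The test split has no labels, so it gets its own `DataLoader` with `shuffle=False`, keeping predictions aligned with the row order of `test.csv` (a minimal sketch; `test_iter` is used for inference below):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# No shuffling: prediction order must match the order of test.csv\n",
    "test_iter = data.DataLoader(test_ds, batch_size=8, shuffle=False)"
   ]
  },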
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "class LSTM(nn.Module):\n",
    "\n",
    "    def __init__(self):\n",
    "        super(LSTM, self).__init__()\n",
    "        self.word_embeddings = nn.Embedding(1000, 300)  # embedding之后的shape: torch.Size([200, 8, 300])\n",
    "        # 若使用预训练的词向量，需在此处指定预训练的权重\n",
    "        # embedding.weight.data.copy_(weight_matrix)\n",
    "        self.lstm = nn.LSTM(input_size=300, hidden_size=128, num_layers=1)  # torch.Size([200, 8, 128])\n",
    "        self.decoder = nn.Linear(128, 2)\n",
    "\n",
    "    def forward(self, sentence):\n",
    "        embeds = self.word_embeddings(sentence)\n",
    "        lstm_out = self.lstm(embeds)[0]  # torch.Size([8, 200, 128])\n",
    "        # 取最后一个时间步\n",
    "        final = lstm_out[:, -1, :]  # 8*128\n",
    "        y = self.decoder(final)  # 8*2 \n",
    "        return y\n"
   ]
  },
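  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before training, a quick forward-pass check: a batch of 8 padded sequences should come out as 8 rows of two-class logits, matching the shape comments in the class (a minimal sketch with an untrained model):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shape check on one untrained batch: (8, 200) -> (8, 2)\n",
    "with torch.no_grad():\n",
    "    logits = LSTMClassifier()(x_train[:8])\n",
    "print(logits.shape)  # expected: torch.Size([8, 2])"
   ]
  },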
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = LSTM()\n",
    "model.train()\n",
    "optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)\n",
    "loss_funtion = F.cross_entropy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0.7846, grad_fn=<NllLossBackward>)\n",
      "tensor(0.7741, grad_fn=<NllLossBackward>)\n",
      "tensor(0.8200, grad_fn=<NllLossBackward>)\n",
      "tensor(0.8240, grad_fn=<NllLossBackward>)\n"
     ]
    }
   ],
   "source": [
    "for epoch, batch in enumerate(train_iter):\n",
    "    optimizer.zero_grad()\n",
    "    predicted = model(batch[0])\n",
    "\n",
    "    loss = loss_funtion(predicted, batch[1])\n",
    "    loss.backward()\n",
    "    optimizer.step()\n",
    "    print(loss)"
   ]
  },
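  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With one pass over the training data done, the model can score the unlabeled test set in evaluation mode. A minimal sketch, assuming the `test_iter` loader defined above; it collects the predicted probability of the toxic class for each comment:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.eval()  # switch off training-only behaviour for inference\n",
    "all_probs = []\n",
    "with torch.no_grad():\n",
    "    for (x_batch,) in test_iter:\n",
    "        logits = model(x_batch)\n",
    "        # probability of class 1 (toxic) from the two-class logits\n",
    "        all_probs.append(F.softmax(logits, dim=1)[:, 1])\n",
    "all_probs = torch.cat(all_probs)  # one probability per test comment"
   ]
  },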
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 补充：使用Keras搭建LSTM模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_features = 1000   # 词汇表大小\n",
    "# cut texts after this number of words (among top max_features most common words)\n",
    "# 裁剪文本为 maxlen 大小的长度（取最后部分，基于前 max_features 个常用词）\n",
    "maxlen = 200 \n",
    "batch_size = 8  # 批数据量大小\n",
    "\n",
    "model = Sequential()\n",
    "# 嵌入层，每个词维度为128\n",
    "model.add(Embedding(max_features, 128))\n",
    "# LSTM层，输出维度128，可以尝试着换成 GRU 试试\n",
    "model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))  # try using a GRU instead, for fun\n",
    "model.add(Dense(1))   # 单神经元全连接层\n",
    "model.add(Activation('sigmoid'))   # sigmoid 激活函数层"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(loss='binary_crossentropy',\n",
    "              optimizer='adam',\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 22 samples, validate on 3 samples\n",
      "Epoch 1/3\n",
      "22/22 [==============================] - 2s 96ms/step - loss: 0.6901 - acc: 0.6364 - val_loss: 0.6740 - val_acc: 1.0000\n",
      "Epoch 2/3\n",
      "22/22 [==============================] - 1s 35ms/step - loss: 0.6653 - acc: 0.8636 - val_loss: 0.6378 - val_acc: 1.0000\n",
      "Epoch 3/3\n",
      "22/22 [==============================] - 1s 32ms/step - loss: 0.6218 - acc: 0.8636 - val_loss: 0.5511 - val_acc: 1.0000\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x127685748>"
      ]
     },
     "execution_count": 144,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(x_train.numpy(), y_train.numpy(), \n",
    "          validation_split=0.1, \n",
    "          batch_size=batch_size, \n",
    "          epochs=3)"
   ]
  },
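  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For parity with the PyTorch loop, the fitted Keras model can score the padded test sequences directly; `predict` returns the sigmoid outputs, i.e. one toxic probability per comment (a minimal sketch):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One toxic-probability per padded test sequence\n",
    "keras_probs = model.predict(x_test.numpy(), batch_size=batch_size)\n",
    "print(keras_probs.shape)  # (n_test, 1)"
   ]
  },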
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:pytorch]",
   "language": "python",
   "name": "conda-env-pytorch-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
